Classification with Logistic Regression



In [61]:

    
# Importing libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('bmh')
%matplotlib inline



In [62]:

    
# Read the Iris measurements into a DataFrame straight from the Rdatasets CSV.
# The URL is bound to its own name so the same file can be read again later.

data_url = 'https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/csv/datasets/iris.csv'
df = pd.read_csv(filepath_or_buffer=data_url)



In [63]:

    
# Preview the first five rows of the loaded frame
df.head(n=5)









    Out[63]:







  
    
      
      Unnamed: 0
      Sepal.Length
      Sepal.Width
      Petal.Length
      Petal.Width
      Species
    
  
  
    
      0
      1
      5.1
      3.5
      1.4
      0.2
      setosa
    
    
      1
      2
      4.9
      3.0
      1.4
      0.2
      setosa
    
    
      2
      3
      4.7
      3.2
      1.3
      0.2
      setosa
    
    
      3
      4
      4.6
      3.1
      1.5
      0.2
      setosa
    
    
      4
      5
      5.0
      3.6
      1.4
      0.2
      setosa



In [64]:

    
# Let's remove the leftover row-number column ('Unnamed: 0')

# The column is dropped by NAME rather than by position (df.iloc[:, 1:]).
# A positional slice removes whatever column happens to be first, so running
# this cell a second time would silently throw away Sepal.Length as well.
# errors='ignore' turns a repeated run into a harmless no-op.

df = df.drop(columns=['Unnamed: 0'], errors='ignore')

df.head()









    Out[64]:







  
    
      
      Sepal.Length
      Sepal.Width
      Petal.Length
      Petal.Width
      Species
    
  
  
    
      0
      5.1
      3.5
      1.4
      0.2
      setosa
    
    
      1
      4.9
      3.0
      1.4
      0.2
      setosa
    
    
      2
      4.7
      3.2
      1.3
      0.2
      setosa
    
    
      3
      4.6
      3.1
      1.5
      0.2
      setosa
    
    
      4
      5.0
      3.6
      1.4
      0.2
      setosa



In [65]:

    
# Rename the columns so the names no longer contain '.'.
# The list is assigned positionally, so its order must match the current column order.

new_columns = [
    "Sepal_Length",
    "Sepal_Width",
    "Petal_Length",
    "Petal_Width",
    "Species",
]

df.columns = new_columns



In [66]:

    
# Check the first five rows again, now with the renamed columns
df.head(n=5)









    Out[66]:







  
    
      
      Sepal_Length
      Sepal_Width
      Petal_Length
      Petal_Width
      Species
    
  
  
    
      0
      5.1
      3.5
      1.4
      0.2
      setosa
    
    
      1
      4.9
      3.0
      1.4
      0.2
      setosa
    
    
      2
      4.7
      3.2
      1.3
      0.2
      setosa
    
    
      3
      4.6
      3.1
      1.5
      0.2
      setosa
    
    
      4
      5.0
      3.6
      1.4
      0.2
      setosa



In [67]:

    
# List the distinct values that occur in the Species column
species_column = df['Species']
species_column.unique()









    Out[67]:





array(['setosa', 'versicolor', 'virginica'], dtype=object)



In [68]:

    
# Add a numeric `Class` column by translating each Species name to an integer label

species_to_class = {
    'setosa': 0,
    'versicolor': 1,
    'virginica': 2,
}
df['Class'] = df['Species'].map(species_to_class)

df.head(n=5)









    Out[68]:







  
    
      
      Sepal_Length
      Sepal_Width
      Petal_Length
      Petal_Width
      Species
      Class
    
  
  
    
      0
      5.1
      3.5
      1.4
      0.2
      setosa
      0
    
    
      1
      4.9
      3.0
      1.4
      0.2
      setosa
      0
    
    
      2
      4.7
      3.2
      1.3
      0.2
      setosa
      0
    
    
      3
      4.6
      3.1
      1.5
      0.2
      setosa
      0
    
    
      4
      5.0
      3.6
      1.4
      0.2
      setosa
      0



In [69]:

    
# Keep only the rows whose Class is not 2, leaving classes 0 and 1 for prediction
is_not_class_two = df['Class'].ne(2)
df_two = df[is_not_class_two]



In [70]:

    
# Summary statistics for the numeric columns of the two-class subset

summary_stats = df_two.describe()
summary_stats









    Out[70]:







  
    
      
      Sepal_Length
      Sepal_Width
      Petal_Length
      Petal_Width
      Class
    
  
  
    
      count
      100.000000
      100.000000
      100.000000
      100.000000
      100.000000
    
    
      mean
      5.471000
      3.099000
      2.861000
      0.786000
      0.500000
    
    
      std
      0.641698
      0.478739
      1.449549
      0.565153
      0.502519
    
    
      min
      4.300000
      2.000000
      1.000000
      0.100000
      0.000000
    
    
      25%
      5.000000
      2.800000
      1.500000
      0.200000
      0.000000
    
    
      50%
      5.400000
      3.050000
      2.450000
      0.800000
      0.500000
    
    
      75%
      5.900000
      3.400000
      4.325000
      1.300000
      1.000000
    
    
      max
      7.000000
      4.400000
      5.100000
      1.800000
      1.000000



In [71]:

    
# import Logistic regression classifier

from sklearn.linear_model import LogisticRegression



In [72]:

    
# Assign features to X and Class (label) to y

# Columns are picked by NAME rather than by position (iloc[:, :-2] / iloc[:, -1]).
# Positional slicing silently changes meaning when a column is added or reordered:
# Species or Class could end up inside X, or y could become the wrong column.
feature_columns = ["Sepal_Length", "Sepal_Width", "Petal_Length", "Petal_Width"]

# All rows of the four measurement columns, as a NumPy array
X = df_two[feature_columns].values

# All rows of the numeric Class column (kept as a pandas Series)
y = df_two["Class"]



In [73]:

    
# Hold out 30% of the rows for testing; random_state pins the shuffle so the split is repeatable

from sklearn.model_selection import train_test_split

splits = train_test_split(X, y, test_size=0.30, random_state=42)
X_train, X_test, y_train, y_test = splits



In [74]:

    
# Initialize the model with its default settings
logistic = LogisticRegression()

# Train the model on the training split
logistic.fit(X_train, y_train)

# Get accuracy score of testing data
score = logistic.score(X_test, y_test)

# Convert to a percentage first and round last, using the built-in round():
#  - `score.round(...)` exists only on NumPy scalars and raises AttributeError
#    when `score` is a plain Python float;
#  - rounding before multiplying by 100 can leave floating-point noise in the
#    printed number.
print("Accuracy is: {}%".format(round(float(score) * 100, 2)))









    



Accuracy is: 100.0%



In [75]:

    
# Ask the model for class probabilities rather than hard labels,
# then display the rows for the first five test samples

test_probabilities = logistic.predict_proba(X_test)
test_probabilities[:5]









    Out[75]:





array([[ 0.00225536,  0.99774464],
       [ 0.01429386,  0.98570614],
       [ 0.00663398,  0.99336602],
       [ 0.94522101,  0.05477899],
       [ 0.94812131,  0.05187869]])

Practice time!

In the example above, we used only two of the three flower classes to build a classifier. This time, build a classifier that uses all three classes, and include the necessary preprocessing steps.



In [76]:

    
# Re-read the raw CSV so the exercise starts from the unmodified data
df = pd.read_csv(filepath_or_buffer=data_url)
df.head(n=5)









    Out[76]:







  
    
      
      Unnamed: 0
      Sepal.Length
      Sepal.Width
      Petal.Length
      Petal.Width
      Species
    
  
  
    
      0
      1
      5.1
      3.5
      1.4
      0.2
      setosa
    
    
      1
      2
      4.9
      3.0
      1.4
      0.2
      setosa
    
    
      2
      3
      4.7
      3.2
      1.3
      0.2
      setosa
    
    
      3
      4
      4.6
      3.1
      1.5
      0.2
      setosa
    
    
      4
      5
      5.0
      3.6
      1.4
      0.2
      setosa



In [ ]:

	Unnamed: 0	Sepal.Length	Sepal.Width	Petal.Length	Petal.Width	Species
0	1	5.1	3.5	1.4	0.2	setosa
1	2	4.9	3.0	1.4	0.2	setosa
2	3	4.7	3.2	1.3	0.2	setosa
3	4	4.6	3.1	1.5	0.2	setosa
4	5	5.0	3.6	1.4	0.2	setosa

	Sepal_Length	Sepal_Width	Petal_Length	Petal_Width	Class
count	100.000000	100.000000	100.000000	100.000000	100.000000
mean	5.471000	3.099000	2.861000	0.786000	0.500000
std	0.641698	0.478739	1.449549	0.565153	0.502519
min	4.300000	2.000000	1.000000	0.100000	0.000000
25%	5.000000	2.800000	1.500000	0.200000	0.000000
50%	5.400000	3.050000	2.450000	0.800000	0.500000
75%	5.900000	3.400000	4.325000	1.300000	1.000000
max	7.000000	4.400000	5.100000	1.800000	1.000000